# libraries
library(knitr)
library(timetk)
library(tidytext)
library(plotly)
library(ggwordcloud)
library(ggiraphExtra)
library(fs)
library(readr)
library(tidyverse)
library(tidymodels)
library(tidyquant)
library(recipes)
library(umap)
# Python integration
library(reticulate)
conda create -n py3.8 python=3.8 scikit-learn pandas numpy matplotlib
This command does the following: creates a new Python environment called "py3.8"; installs Python version 3.8; and installs the latest versions of scikit-learn, pandas, numpy, and matplotlib.
Replace "py3.8" with the name of your own conda environment containing scikit-learn, pandas, and numpy.
# Load the wrangled transaction/product data and the pre-made train/test split.
# NOTE(review): paths spell the folder "data_wranggled" (sic) — this matches the
# on-disk directory name, do not "fix" it without renaming the folder.
retail_order_tbl <- read_rds("00_Data/data_wranggled/trans_data/retail_order_tbl.rds")
products_manual_tbl <- read_rds("00_Data/data_wranggled/product_data/products_manual_tbl.rds")
step_2_split_test <- read_rds("00_Data/data_split/step_2_split_test.rds")
step_2_split_train <- read_rds("00_Data/data_split/step_2_split_train.rds")
# Unpack the training split list: features ...
train_data_tbl <- step_2_split_train$train
# ... and labels. NOTE(review): labels are taken from the `$test` element of the
# *train* split object — presumably the split list stores features in $train and
# labels in $test, but confirm this is not a typo for a `$label` element.
train_label_tbl <- step_2_split_train$test
Problems: - Products are highly unique and have no categories - We need categories to learn which customers are similar in purchasing behaviour
Solution: - Price Feature Engineering will help group products by price categories - Text Feature Engineering is essential to compare description fields - Stock Code Engineering will group products with similar catalogue
Hypothesis: - Similar Products Embeds Similar Word Token - Similar Products Have Similar Price Variations - Similar Products Have Similar Stock Codes (digits)
Algorithm: - Kmeans: Distance-Based Metric Learning - DBSCAN: Density-Based Metric Learning
Evaluation: - Summarise cluster result of its key components + comparative analysis - Perform double-clustering the result with customer purchase behaviour statistics (RFM metric) to evaluate these product category roles.
– Use ‘tidytext’ to unnest tokens – Stems the tokens using ‘hunspell’ to return only the root of the word
# Tokenise each product description into single-word terms, stem them with
# hunspell, and attach a unit count column for later frequency aggregation.
products_dictionary_tbl <- products_manual_tbl %>%
  select(stock_code, mode_description) %>%
  # One row per word token of the description
  unnest_tokens(terms, mode_description, token = "words") %>%
  # Drop single-character tokens (no semantic value)
  filter(nchar(terms) != 1) %>%
  # hunspell_stem() returns a list of candidate stems per token
  mutate(terms = hunspell::hunspell_stem(terms)) %>%
  unnest(terms) %>%
  # Each surviving token counts once
  mutate(n = 1)
– Count frequency – Remove stop words (e.g. “of” has no meaning) – Remove colour terms (e.g. “pink” has no meaning) – Remove terms containing numbers (e.g. “130.5cm” has no meaning)
# Aggregate token counts into an overall term-frequency table, after filtering
# out stop words, colour names, and tokens that contain digits.
terms_frequency_tbl <- products_dictionary_tbl %>%
  # Drop stop words such as "of"
  anti_join(stop_words, by = c("terms" = "word")) %>%
  # Drop colour names such as "pink"
  filter(!(terms %in% colours())) %>%
  # Drop tokens containing digits such as "130.5cm"
  filter(!str_detect(terms, pattern = "[0-9]")) %>%
  # Sum the per-token unit counts into a frequency per term
  group_by(terms) %>%
  summarise(n = sum(n)) %>%
  arrange(desc(n))
Plan:
# Load project plotting helpers (get_rank_cumulative(), plot_rank_cumulative()).
source("00_Functions/Visualisation_function/visual.R")
# Rank terms by frequency and keep the cumulative structure up to 50% coverage.
# NOTE(review): exact semantics of .cumulative_max live in the sourced helper — verify there.
rank_term_frequency_tbl <- terms_frequency_tbl %>%
get_rank_cumulative(.cumulative_max = 0.5)
# Visualise the cumulative-frequency curve for the first 500 ranked terms.
rank_term_frequency_tbl %>% plot_rank_cumulative(max_slice = 500)
# Word cloud of the 100 most frequent terms, sized and coloured by frequency.
terms_frequency_tbl %>%
  slice(1:100) %>%
  # Reverse factor levels so the most frequent terms get the expected ordering
  mutate(terms = fct_rev(as_factor(terms))) %>%
  ggplot() +
  geom_text_wordcloud_area(aes(label = terms, size = n, colour = n)) +
  scale_size_area(max_size = 14) +
  scale_colour_viridis_c(direction = -1) +
  # Dark background makes the reversed viridis palette legible
  theme(
    plot.background = element_rect(fill = "black"),
    panel.background = element_rect(fill = "black")
  )
# Keep only the 50 most frequent terms, then spread them into wide
# term-count features: one column per term, one row per stock code.
top_50_terms <- terms_frequency_tbl %>%
  slice(1:50) %>%
  pull(terms)

product_term_feature_tbl <- products_dictionary_tbl %>%
  filter(terms %in% top_50_terms) %>%
  pivot_wider(
    names_from = terms,
    values_from = n,
    # Absent term -> 0; duplicate term within a product -> summed count
    values_fill = list(n = 0),
    values_fn = list(n = sum)
  )
Problems: - Price Variations - Highly Skewed (Critical damage for distance based algorithm)
Solution: - Implement Simple Statistics of product’s Price Range - Flexible Transformation (Data Pre-processing) -> Box-Cox or Yeo-Johnson
# Per-product price statistics computed from the training transactions.
# These summarise each product's price level and variation for clustering.
product_price_range_tbl <- train_data_tbl %>%
  group_by(stock_code) %>%
  summarise(
    .groups = "drop",
    n_prices = n(),
    med_price = median(unit_price) %>% round(2),
    # BUG FIX: previously computed median() here despite the `mean_price` name
    mean_price = mean(unit_price) %>% round(2),
    min_price = min(unit_price),
    p25_price = quantile(unit_price)[2],
    p50_price = quantile(unit_price)[3],
    p75_price = quantile(unit_price)[4],
    max_price = max(unit_price),
    # Relative price spread, normalised by the mean price
    range_price = ((max_price - min_price) / mean_price) %>% round(2)
  )
A stock code here is not a stock keeping unit. Depending on the type of inventory, retailers include identifying information for everything from department to style, target customer, routine versus seasonal/occasional use, etc. This identifies a product and helps the business track inventory.
# Decompose the numeric part of each stock code into per-digit features.
# Stock codes encode catalogue information (department, style, season, ...),
# so individual digits can carry grouping signal for clustering.
product_code_tbl <- product_term_feature_tbl %>%
  select(stock_code) %>%
  # BUG FIX: str_replace() removes only the FIRST alphabetic character;
  # str_replace_all() strips every letter/space so only digits remain.
  mutate(code = str_replace_all(stock_code, "[a-zA-Z ]", "")) %>%
  # sep = "" splits between every character, so the first piece is always
  # the empty string -> code_digit_6 is dropped immediately below.
  separate(col = code,
           into = str_c("code_digit_", 6:1),
           sep = "",
           remove = FALSE) %>%
  select(-code_digit_6) %>%
  # funs() is deprecated; across() is the modern equivalent
  mutate(across(starts_with("code_digit_"), as.numeric)) %>%
  select(-code)
Pre-processing steps. Step_1: Remove near-zero-variance features – this removes most of the tokenised term features whose variance falls below the given frequency threshold. Step_2: A Yeo-Johnson generalised transformation is applied to the numeric price data, which is heavily skewed. Steps_3 &amp; 4: the kmeans and DBSCAN algorithms rely on distance metrics (e.g. euclidean distance, density), hence it is important to center and scale the data.
# Combine term features, stock-code digit features, and price statistics
# into one engineered feature table keyed by stock_code.
product_engineered_tbl <- product_term_feature_tbl %>%
  # FIX: make the join key explicit instead of relying on natural-join
  # messages — guards against accidental extra common columns.
  left_join(product_code_tbl, by = "stock_code") %>%
  left_join(product_price_range_tbl, by = "stock_code")

# Names of the price columns; these are heavily skewed and will receive a
# Yeo-Johnson transformation in the recipe below.
skewed_feature_names <- product_engineered_tbl %>%
  select(contains("price")) %>%
  colnames()
# Pre-processing recipe for the distance-based clustering algorithms:
#   1. drop near-zero-variance (mostly-zero term) predictors
#   2. Yeo-Johnson transform the skewed price features
#   3-4. centre and scale all numerics (k-means / DBSCAN are distance based)
recipe_obj <- recipe(stock_code ~ ., product_engineered_tbl) %>%
  step_nzv(all_predictors()) %>%
  # FIX: wrap the external character vector in all_of(); passing a bare
  # vector of column names to a recipe step is deprecated tidyselect usage
  # and errors in recent versions.
  step_YeoJohnson(all_of(skewed_feature_names)) %>%
  step_center(all_numeric()) %>%
  step_scale(all_numeric()) %>%
  prep()

# Apply the trained recipe and drop incomplete rows so distance
# computations cannot hit NA.
prep_product_cluster_tbl <- recipe_obj %>%
  juice() %>%
  filter(complete.cases(.))

# Feature matrix (X) and identifier labels (Y) for clustering.
X_train <- prep_product_cluster_tbl %>% select(-stock_code)
Y_label <- prep_product_cluster_tbl %>% select(stock_code)
– Kmeans Clustering – Scree Plot: to find the optimal number of clusters k – 2-D UMAP: for visualisation purposes
# Fit k-means with `center` clusters on `data` (defaults to the global
# X_train feature matrix). A fixed seed makes each fit reproducible.
kmeans_mapper <- function(center = 3, data = X_train) {
  set.seed(42)
  kmeans(data, centers = center, nstart = 20)
}
# Fit k-means for k = 1..20 and collect model-level summaries (glance)
# for the scree plot below.
centers_tbl <- tibble(centers = 1:20)

k_means_mapped_tbl <- centers_tbl %>%
  mutate(
    k_means = map(centers, kmeans_mapper),
    glance = map(k_means, broom::glance)
  )
– Optimal k-cluster ~ 10
# Scree plot of a k-means fit metric (default tot.withinss) against the
# number of clusters, used to read off the elbow.
plot_kmeans_skree <- function(kmeans_map_data, .metric = tot.withinss) {
  metric_expr <- enquo(.metric)
  line_colour <- "#2c3e50"
  kmeans_map_data %>%
    unnest(glance) %>%
    ggplot(aes(x = centers, y = !!metric_expr)) +
    geom_point(colour = line_colour, size = 3) +
    geom_line(colour = line_colour, size = 1) +
    # Label each point with its k so the elbow can be read directly
    ggrepel::geom_label_repel(aes(label = centers), colour = line_colour, size = 4) +
    theme_tq()
}

# Elbow suggests the optimal k is around 10.
k_means_mapped_tbl %>% plot_kmeans_skree()
UMAP – 2-dimensional Visualisation
# Project the prepared feature matrix to 2-D with UMAP for visualisation.
umap_results <- umap(X_train)

# $layout holds the 2-D embedding; name the axes x/y and attach the
# original feature columns alongside.
umap_results_tbl <- umap_results$layout %>%
  as_tibble() %>%
  setNames(c("x", "y")) %>%
  cbind(X_train)
# Pull the fitted k-means object for the chosen k = 10.
k_means_obj <- k_means_mapped_tbl %>%
  filter(centers == 10) %>%
  pull(k_means) %>%
  pluck(1)

# Attach cluster assignments and stock codes to the UMAP coordinates.
umap_kmeans_results_tbl <- k_means_obj %>%
  broom::augment(X_train) %>%
  bind_cols(Y_label) %>%
  select(stock_code, .cluster) %>%
  bind_cols(umap_results_tbl)

# 2-D UMAP scatter coloured by cluster membership.
umap_kmeans_results_tbl %>%
  ggplot(aes(x, y, colour = .cluster)) +
  geom_point(alpha = 0.5) +
  theme_tq() +
  scale_colour_tq()
kmeans Result Output + Save
# Column names of the term features that were actually fed to k-means
# (everything except price, digit, cluster, UMAP coordinate, and id columns).
kmeans_fed_terms <- umap_kmeans_results_tbl %>%
  select(!contains("price")) %>%
  select(!contains("digit")) %>%
  select(-.cluster, -x, -y, -stock_code) %>%
  colnames()

# Final cluster table: cluster id + UMAP coordinates, re-joined with the
# un-transformed engineered features and the manual product data.
kmeans_10_result_tbl <- umap_kmeans_results_tbl %>%
  select(stock_code, .cluster, x, y) %>%
  # FIX: wrap the external character vector in all_of(); bare external
  # vectors in select() are deprecated and ambiguous. Join keys made
  # explicit rather than relying on natural-join messages.
  left_join(product_engineered_tbl %>% select(stock_code, all_of(kmeans_fed_terms)),
            by = "stock_code") %>%
  left_join(product_engineered_tbl %>% select(stock_code, contains("price")),
            by = "stock_code") %>%
  left_join(product_engineered_tbl %>% select(stock_code, contains("digit")),
            by = "stock_code") %>%
  left_join(products_manual_tbl %>% select(-median_unit_price),
            by = "stock_code")

product_cluster_tbl <- kmeans_10_result_tbl
# Build a stemmed-token dictionary from a document column of `data`
# (default: mode_description): tokenise into words, stem with hunspell,
# then drop stop words, colour names, and tokens containing digits.
# Adds a unit count column `n` for downstream frequency aggregation.
# (Function name "disctionary" (sic) is the exported interface — kept.)
product_disctionary <- function(data,
                                .document = mode_description) {
  document_expr <- enquo(.document)
  data %>%
    unnest_tokens(terms, !!document_expr, token = "words") %>%
    # hunspell_stem() returns a list of candidate stems per token
    mutate(terms = hunspell::hunspell_stem(terms)) %>%
    unnest(terms) %>%
    # Remove stop words, colour names, and numeric tokens
    anti_join(stop_words, by = c("terms" = "word")) %>%
    filter(!(terms %in% colours())) %>%
    filter(!str_detect(terms, pattern = "[0-9]")) %>%
    mutate(n = 1)
}
# Term frequencies within each cluster, keeping the top 50 terms per cluster.
term_freq_by_cluster_tbl <- product_cluster_tbl %>%
  product_disctionary() %>%
  group_by(.cluster, terms) %>%
  summarise(n = sum(n)) %>%
  # BUG FIX: the arrange() argument is `.by_group`, not `by_group`. The
  # original `by_group = TRUE` was silently treated as an extra constant
  # sort key, so rows were NOT actually ordered within each cluster.
  arrange(desc(n), .by_group = TRUE) %>%
  # Result is still grouped by .cluster, so this takes 50 rows per cluster
  slice(1:50)
# Dot plot of the top terms per cluster. The input is still grouped by
# .cluster, so slice(1:10) takes 10 rows per cluster (statement order matters).
g <- term_freq_by_cluster_tbl %>%
slice(1:10) %>%
ungroup() %>%
arrange(desc(n)) %>%
# Order the term factor by frequency so the y axis reads high-to-low
mutate(terms = terms %>% as_factor %>% fct_reorder(n)) %>%
ggplot(aes(n, terms, colour = .cluster)) +
geom_point() +
# Anchor the frequency axis at zero for honest comparison across facets
expand_limits(x = 0) +
facet_wrap(~.cluster, ncol = 5, scales = "free_y") +
theme_tq() +
scale_colour_tq() +
labs(y = "") +
theme(legend.position = "none")
# Interactive version of the faceted plot
ggplotly(g)
# Reload the saved cluster results from disk.
# NOTE(review): this file is only written further down in this script —
# presumably it exists from a previous run; confirm the intended run order.
umap_kmeans_results_tbl <- read_rds("00_Data/data_engineered/umap_kmeans_results_tbl.rds")
# Per-cluster mean of each feature (term counts, prices, code digits),
# shown as a faceted radar chart via the project helper plot_facet_radar().
umap_kmeans_results_tbl %>%
group_by(.cluster) %>%
summarise_at(vars(heart:range_price), .funs = mean) %>%
select(.cluster, heart:candle, med_price, mean_price, max_price, range_price, contains("digit_")) %>%
plot_facet_radar(colour = .cluster, .facet_vars = .cluster)
– kmeans k = 10 (optimum observed via tot.withinss) result table – Final Product Cluster Table
# Persist the k = 10 k-means result table and the final product cluster
# table, and export the dictionary helper for reuse in later analyses.
write_rds(umap_kmeans_results_tbl, "00_Data/data_engineered/umap_kmeans_results_tbl.rds")
write_rds(product_cluster_tbl, "00_Data/data_engineered/product_cluster_tbl.rds")
dump("product_disctionary", "00_Functions/Iterative_analysis/product_disctionary.R")